Dataset¶
Imports¶
In [4]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from scipy.stats import chi2_contingency, shapiro, levene, mannwhitneyu
import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including any raised by pandas, plotly or the scipy tests further down.
warnings.filterwarnings("ignore")
# Color schemas
# Shared styling constants reused by every plotly figure in this notebook.
colorcategories = ['#A6C8E0', '#3182BD','#1D3B5D']  # discrete colour sequence for traces
colorback = 'rgba(0,0,0,0)'  # fully transparent paper/plot background
colortext = '#36414e'  # font colour
fsize = 12  # font size
Exploratory analysis¶
In [6]:
# Load the A/B test dataset from the CSV file next to the notebook
df = pd.read_csv(filepath_or_buffer="marketing_AB.csv")
In [7]:
# Preview the first five rows
df.head(n=5)
Out[7]:
| Unnamed: 0 | user id | test group | converted | total ads | most ads day | most ads hour | |
|---|---|---|---|---|---|---|---|
| 0 | 0 | 1069124 | ad | False | 130 | Monday | 20 |
| 1 | 1 | 1119715 | ad | False | 93 | Tuesday | 22 |
| 2 | 2 | 1144181 | ad | False | 21 | Tuesday | 18 |
| 3 | 3 | 1435133 | ad | False | 355 | Tuesday | 10 |
| 4 | 4 | 1015700 | ad | False | 276 | Friday | 14 |
In [8]:
# Check null values and data types
# (prints the non-null count and dtype of every column, plus memory usage)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 588101 entries, 0 to 588100 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 588101 non-null int64 1 user id 588101 non-null int64 2 test group 588101 non-null object 3 converted 588101 non-null bool 4 total ads 588101 non-null int64 5 most ads day 588101 non-null object 6 most ads hour 588101 non-null int64 dtypes: bool(1), int64(4), object(2) memory usage: 27.5+ MB
In [9]:
# Check if user id is unique (True means no user id occurs in more than one row)
df['user id'].is_unique
Out[9]:
True
In [10]:
# Remove the exported row index and the user identifier; neither is analysed
columns_to_drop = ["Unnamed: 0", "user id"]
df = df.drop(columns=columns_to_drop)
In [11]:
# List the columns that remain in df
df.columns
Out[11]:
Index(['test group', 'converted', 'total ads', 'most ads day',
'most ads hour'],
dtype='object')
In [12]:
# Keep only the categorical variables in a separate frame and count
# the number of distinct values in each of them
categorical_columns = ['test group', 'converted', 'most ads day', 'most ads hour']
df_cat = df[categorical_columns]
df_cat.nunique()
Out[12]:
test group 2 converted 2 most ads day 7 most ads hour 24 dtype: int64
In [13]:
# Print the distinct values of every categorical column
for column_name, column_values in df_cat.items():
    print(column_name, ':', column_values.unique())
test group : ['ad' 'psa'] converted : [False True] most ads day : ['Monday' 'Tuesday' 'Friday' 'Saturday' 'Wednesday' 'Sunday' 'Thursday'] most ads hour : [20 22 18 10 14 13 19 11 12 16 21 3 23 4 8 0 2 15 1 6 17 7 9 5]
In [14]:
# Share of rows per test group, normalised to percent
layout_options = dict(
    font_size=fsize,
    font_color=colortext,
    title='',
    xaxis_title='test group',
    yaxis_title='%',
    paper_bgcolor=colorback,
    plot_bgcolor=colorback,
    width=600,
    height=400,
    showlegend=False,
)
fig = px.histogram(
    df_cat,
    x='test group',
    histnorm='percent',
    text_auto='.1f',
    color_discrete_sequence=colorcategories,
)
fig.update_layout(**layout_options)
fig.show()
In [15]:
# Number of rows per test group
fig = px.histogram(
    df_cat,
    x='test group',
    text_auto='.0f',
    color_discrete_sequence=colorcategories,
).update_layout(
    title='',
    xaxis_title='test group',
    yaxis_title='#',
    font_size=fsize,
    font_color=colortext,
    paper_bgcolor=colorback,
    plot_bgcolor=colorback,
    width=600,
    height=400,
    showlegend=False,
)
fig.show()
In [16]:
# Share of converted vs. not converted rows, normalised to percent
layout_options = dict(
    font_size=fsize,
    font_color=colortext,
    title='',
    xaxis_title='converted',
    yaxis_title='%',
    paper_bgcolor=colorback,
    plot_bgcolor=colorback,
    width=600,
    height=400,
    showlegend=False,
)
fig = px.histogram(
    df_cat,
    x='converted',
    histnorm='percent',
    text_auto='.1f',
    color_discrete_sequence=colorcategories,
)
fig.update_layout(**layout_options)
fig.show()
In [17]:
# Number of converted vs. not converted rows
fig = px.histogram(
    df_cat,
    x='converted',
    text_auto='.0f',
    color_discrete_sequence=colorcategories,
).update_layout(
    title='',
    xaxis_title='converted',
    yaxis_title='#',
    font_size=fsize,
    font_color=colortext,
    paper_bgcolor=colorback,
    plot_bgcolor=colorback,
    width=600,
    height=400,
    showlegend=False,
)
fig.show()
In [18]:
# Most ads day percentage
# Without an explicit category order the bars follow the order in which the
# day names first occur in the data (Monday, Tuesday, Friday, ...), which is
# hard to read; pin them to calendar order instead.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                 'Friday', 'Saturday', 'Sunday']
fig = px.histogram(
    df_cat,
    x='most ads day',
    color_discrete_sequence=colorcategories,
    text_auto='.1f',
    histnorm='percent',
    category_orders={'most ads day': weekday_order},
)
fig.update_layout(font_size=fsize,
                  font_color=colortext,
                  title='',
                  xaxis_title='most ads day',
                  yaxis_title='%',
                  paper_bgcolor=colorback,
                  plot_bgcolor=colorback,
                  width=600,
                  height=400,
                  showlegend=False)
fig.show()
In [19]:
# Most ads day number
# Without an explicit category order the bars follow the order in which the
# day names first occur in the data (Monday, Tuesday, Friday, ...), which is
# hard to read; pin them to calendar order instead.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                 'Friday', 'Saturday', 'Sunday']
fig = px.histogram(
    df_cat,
    x='most ads day',
    color_discrete_sequence=colorcategories,
    text_auto='.0f',
    category_orders={'most ads day': weekday_order},
)
fig.update_layout(font_size=fsize,
                  font_color=colortext,
                  title='',
                  xaxis_title='most ads day',
                  yaxis_title='#',
                  paper_bgcolor=colorback,
                  plot_bgcolor=colorback,
                  width=600,
                  height=400,
                  showlegend=False)
fig.show()
In [20]:
# Share of rows per "most ads hour", normalised to percent
layout_options = dict(
    font_size=fsize,
    font_color=colortext,
    title='',
    xaxis_title='most ads hour',
    yaxis_title='%',
    paper_bgcolor=colorback,
    plot_bgcolor=colorback,
    width=1000,
    height=400,
    showlegend=False,
    bargap=0.3,
)
fig = px.histogram(
    df_cat,
    x='most ads hour',
    histnorm='percent',
    barmode='group',
    text_auto='.1f',
    color_discrete_sequence=colorcategories,
)
fig.update_layout(**layout_options)
# Keep the bar labels horizontal
fig.update_traces(textangle=0)
fig.show()
In [21]:
# Number of rows per "most ads hour"
fig = px.histogram(
    df_cat,
    x='most ads hour',
    barmode='group',
    text_auto='.0f',
    color_discrete_sequence=colorcategories,
).update_layout(
    title='',
    xaxis_title='most ads hour',
    yaxis_title='#',
    font_size=fsize,
    font_color=colortext,
    paper_bgcolor=colorback,
    plot_bgcolor=colorback,
    width=1000,
    height=400,
    showlegend=False,
    bargap=0.3,
)
# Rotate the bar labels so the counts are drawn vertically
fig.update_traces(textangle=-90)
fig.show()
In [22]:
# Distribution of total ads over the full value range
layout_options = dict(
    font_size=fsize,
    font_color=colortext,
    title='',
    xaxis_title='total ads',
    yaxis_title='#',
    paper_bgcolor=colorback,
    plot_bgcolor=colorback,
    width=1000,
    height=400,
    showlegend=False,
)
fig = px.histogram(df, x='total ads', color_discrete_sequence=colorcategories)
fig.update_layout(**layout_options)
fig.show()
In [23]:
# Boxplot of total ads over the full value range
fig = px.box(
    df,
    y='total ads',
    color_discrete_sequence=colorcategories,
).update_layout(
    title='',
    xaxis_title='',
    yaxis_title='total ads',
    font_size=fsize,
    font_color=colortext,
    paper_bgcolor=colorback,
    plot_bgcolor=colorback,
    width=400,
    height=1000,
    showlegend=False,
)
fig.show()
In [24]:
# Summary statistics (count, mean, std, min, quartiles, max) of total ads
# NOTE(review): the quartiles presumably motivated the reduced axis ranges in the following plots
df['total ads'].describe()
Out[24]:
count 588101.000000 mean 24.820876 std 43.715181 min 1.000000 25% 4.000000 50% 13.000000 75% 27.000000 max 2065.000000 Name: total ads, dtype: float64
In [25]:
# Distribution of total ads, x axis limited to the 0-50 range
layout_options = dict(
    font_size=fsize,
    font_color=colortext,
    title='',
    xaxis_title='total ads',
    yaxis_title='#',
    paper_bgcolor=colorback,
    plot_bgcolor=colorback,
    width=1000,
    height=400,
    xaxis_range=[0, 50],
    showlegend=False,
)
fig = px.histogram(df, x='total ads', color_discrete_sequence=colorcategories)
fig.update_layout(**layout_options)
fig.show()
In [26]:
# Boxplot of total ads, y axis limited to the 0-80 range
fig = px.box(
    df,
    y='total ads',
    color_discrete_sequence=colorcategories,
).update_layout(
    title='',
    xaxis_title='',
    yaxis_title='total ads',
    yaxis_range=[0, 80],
    font_size=fsize,
    font_color=colortext,
    paper_bgcolor=colorback,
    plot_bgcolor=colorback,
    width=400,
    height=1000,
    showlegend=False,
)
fig.show()
Bivariate analysis¶
In [28]:
# List the columns available for the bivariate analysis
df.columns
Out[28]:
Index(['test group', 'converted', 'total ads', 'most ads day',
'most ads hour'],
dtype='object')
In [29]:
# Conversion share per test group (each row sums to 1),
# displayed with the highest conversion rate first
ct_conversion_test_group = pd.crosstab(
    index=df['test group'],
    columns=df['converted'],
    normalize='index',
)
ct_conversion_test_group.sort_values(by=True, ascending=False)
Out[29]:
| converted | False | True |
|---|---|---|
| test group | ||
| ad | 0.974453 | 0.025547 |
| psa | 0.982146 | 0.017854 |
In [30]:
# Conversion share per "most ads day" (each row sums to 1),
# displayed with the highest conversion rate first
ct_conversion_most_ads_day = pd.crosstab(
    index=df['most ads day'],
    columns=df['converted'],
    normalize='index',
)
ct_conversion_most_ads_day.sort_values(by=True, ascending=False)
Out[30]:
| converted | False | True |
|---|---|---|
| most ads day | ||
| Monday | 0.967188 | 0.032812 |
| Tuesday | 0.970160 | 0.029840 |
| Wednesday | 0.975058 | 0.024942 |
| Sunday | 0.975524 | 0.024476 |
| Friday | 0.977788 | 0.022212 |
| Thursday | 0.978429 | 0.021571 |
| Saturday | 0.978949 | 0.021051 |
In [31]:
# Conversion share per "most ads hour" (each row sums to 1),
# displayed with the highest conversion rate first
ct_conversion_most_ads_hour = pd.crosstab(
    index=df['most ads hour'],
    columns=df['converted'],
    normalize='index',
)
ct_conversion_most_ads_hour.sort_values(by=True, ascending=False)
Out[31]:
| converted | False | True |
|---|---|---|
| most ads hour | ||
| 16 | 0.969228 | 0.030772 |
| 20 | 0.970197 | 0.029803 |
| 15 | 0.970347 | 0.029653 |
| 21 | 0.971077 | 0.028923 |
| 17 | 0.971790 | 0.028210 |
| 14 | 0.971937 | 0.028063 |
| 18 | 0.972620 | 0.027380 |
| 19 | 0.973280 | 0.026720 |
| 22 | 0.973895 | 0.026105 |
| 13 | 0.975323 | 0.024677 |
| 12 | 0.976172 | 0.023828 |
| 23 | 0.977338 | 0.022662 |
| 6 | 0.977756 | 0.022244 |
| 11 | 0.977884 | 0.022116 |
| 10 | 0.978479 | 0.021521 |
| 5 | 0.979085 | 0.020915 |
| 8 | 0.980484 | 0.019516 |
| 9 | 0.980809 | 0.019191 |
| 0 | 0.981575 | 0.018425 |
| 7 | 0.981889 | 0.018111 |
| 4 | 0.984765 | 0.015235 |
| 1 | 0.987089 | 0.012911 |
| 3 | 0.989548 | 0.010452 |
| 2 | 0.992687 | 0.007313 |
In [32]:
# Boxplot of total ads split by conversion outcome (full value range)
layout_options = dict(
    font_size=fsize,
    font_color=colortext,
    title='',
    xaxis_title='converted',
    yaxis_title='total ads',
    paper_bgcolor=colorback,
    plot_bgcolor=colorback,
    width=400,
    height=1000,
    showlegend=False,
)
fig = px.box(df, x='converted', y='total ads', color_discrete_sequence=colorcategories)
fig.update_layout(**layout_options)
fig.show()
In [33]:
# Boxplot of total ads split by conversion outcome, y axis limited to 0-250
fig = px.box(
    df,
    x='converted',
    y='total ads',
    color_discrete_sequence=colorcategories,
).update_layout(
    title='',
    xaxis_title='converted',
    yaxis_title='total ads',
    yaxis_range=[0, 250],
    font_size=fsize,
    font_color=colortext,
    paper_bgcolor=colorback,
    plot_bgcolor=colorback,
    width=400,
    height=1000,
    showlegend=False,
)
fig.show()
Statistical test¶
In [35]:
# Column labels of df_cat with 'converted' removed
df_cat.columns.drop('converted')
Out[35]:
Index(['test group', 'most ads day', 'most ads hour'], dtype='object')
In [36]:
# Chi-square test of independence between every categorical variable
# and the conversion flag; the decision uses a 0.05 threshold on the p-value.
predictors = df_cat.columns.drop('converted')
for variable in predictors:
    contingency_table = pd.crosstab(df[variable], df['converted'])
    # Only the statistic and the p-value are needed from the test result
    test_result = chi2_contingency(contingency_table)
    chi2, p = test_result[0], test_result[1]
    print(f"Variable: {variable}")
    print(f"Chi2 Statistic: {chi2}")
    print(f"P-value: {p}")
    # Decision
    verdict = (
        f"Reject Null Hypothesis: {variable} and converted variables are dependent."
        if p < 0.05
        else f"Fail to Reject Null Hypothesis: {variable} and converted variable are independent."
    )
    print(verdict)
    print("\n")
Variable: test group Chi2 Statistic: 54.005823883685245 P-value: 1.9989623063390078e-13 Reject Null Hypothesis: test group and converted variables are dependent. Variable: most ads day Chi2 Statistic: 410.0478857936585 P-value: 1.932184379244731e-85 Reject Null Hypothesis: most ads day and converted variables are dependent. Variable: most ads hour Chi2 Statistic: 430.76869230822086 P-value: 8.027629823696774e-77 Reject Null Hypothesis: most ads hour and converted variables are dependent.
In [37]:
# Check the normality of the distribution of total ads within each
# conversion group with the Shapiro-Wilk test (H0: the sample is normal).
# NOTE(review): SciPy documents that for samples larger than 5000 the W
# statistic is accurate but the p-value may not be. Both groups here appear
# to exceed that size, so treat the p-values as indicative only.
# 'converted' is a bool column, so it can be used directly as a row mask;
# .loc avoids the chained indexing df[mask]['total ads'].
ads_converted = df.loc[df['converted'], 'total ads']
ads_not_converted = df.loc[~df['converted'], 'total ads']

shapiro_converted_stat, shapiro_converted_p = shapiro(ads_converted)
print(f"Shapiro Statistic for converted: {shapiro_converted_stat}")
print(f"Shapiro p-value for converted: {shapiro_converted_p}")
if shapiro_converted_p < 0.05:
    print("Reject H0: Data is not normally distributed.")
else:
    print("Fail to reject H0: Data is normally distributed.")

shapiro_not_converted_stat, shapiro_not_converted_p = shapiro(ads_not_converted)
print(f"Shapiro Statistic for not converted: {shapiro_not_converted_stat}")
print(f"Shapiro p-value for not converted: {shapiro_not_converted_p}")
if shapiro_not_converted_p < 0.05:
    print("Reject H0: Data is not normally distributed.")
else:
    print("Fail to reject H0: Data is normally distributed.")
Shapiro Statistic for converted: 0.6578396248200824 Shapiro p-value for converted: 1.638680987007771e-98 Reject H0: Data is not normally distributed. Shapiro Statistic for not converted: 0.4746742488927551 Shapiro p-value for not converted: 9.883049430735801e-204 Reject H0: Data is not normally distributed.
In [ ]:
# Levene test for equal variances of total ads between
# converted and not converted rows (H0: variances are equal)
converted_rows = df['converted'].eq(True)
not_converted_rows = df['converted'].eq(False)
levene_stat, levene_p = levene(
    df.loc[converted_rows, 'total ads'],
    df.loc[not_converted_rows, 'total ads'],
)
print(f"Levene Statistic: {levene_stat}")
print(f"Levene p-value: {levene_p}")
levene_verdict = (
    "Reject H0: Variances are different."
    if levene_p < 0.05
    else "Fail to reject H0: Variances are equal."
)
print(levene_verdict)
Levene Statistic: 9121.196956737573 Levene p-value: 0.0 Reject H0: Variances are different.
Since total ads is not normally distributed in either group and the variances differ, a non-parametric test needs to be used.
In [ ]:
# Mann-Whitney U test comparing total ads between converted and
# not converted rows.
# The alternative is stated explicitly so the p-value does not depend on the
# default of the installed SciPy version. A two-sided test only detects that
# the groups differ; it does not say which group saw more ads.
mannwhitneyu_stat, mannwhitneyu_p = mannwhitneyu(
    df[df['converted']==True]['total ads'],
    df[df['converted']==False]['total ads'],
    alternative='two-sided',
)
print(f"Mann-Whitney U Test Statistic: {mannwhitneyu_stat}")
print(f"Mann-Whitney U Test p-value: {mannwhitneyu_p}")
if mannwhitneyu_p < 0.05:
    print("Reject H0: Significant difference between the two groups.")
else:
    print("Fail to reject H0: No significant difference between the two groups.")
Results¶
To sum up:
- All of the variables have a statistically significant association with the conversion rate.
- Customers who saw the ad were more likely to convert.
- Customers who saw the most ads on Monday or Tuesday were more likely to convert.
- Customers who saw ads at 16 and 20 were more likely to convert.
- Customers who saw more ads were more likely to convert.
In [ ]: